In [9]:
import csv

import numpy as np
import pandas as pd
import pandas_profiling
In [33]:
# Load the raw export into df_raw.
#
# Malformed rows are still skipped, but no longer silently:
# - error_bad_lines / warn_bad_lines were deprecated in pandas 1.3 and removed
#   in pandas 2.0; on_bad_lines is their replacement.
# - Passing a callable (pandas >= 1.4, engine='python' only) lets us keep every
#   rejected line for inspection. list.append returns None, which tells pandas
#   to drop that line, so df_raw holds the same rows as with 'skip'.
skipped_lines = []
df_raw = pd.read_csv(
    'data/M_query_20200512.csv',
    sep=',',
    skipinitialspace=True,
    quoting=csv.QUOTE_ALL,
    engine='python',
    on_bad_lines=skipped_lines.append,
)
# Make the amount of discarded data visible instead of hiding it.
print(f"Loaded {len(df_raw)} rows; skipped {len(skipped_lines)} malformed lines")
In [27]:
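# Alternative import attempt, commented out and not used; kept for reference only.
# Note: this variant passes escapechar=',' which is the same character as sep.
#import csv
#df_raw = pd.read_csv('data/M_query_20200512.csv', engine='python', sep=',',escapechar=',', quotechar='"', error_bad_lines=False)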
In [34]:
# (rows, columns) of the loaded frame -- quick check of how much data was read.
df_raw.shape
Out[34]:
(547854, 23)
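Note: the file M_query_20200512.csv has 623808 lines, but only 547854 rows were loaded, a difference of 75954 lines (623808 - 547854), presumably lines skipped as malformed during import.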
In [35]:
# Preview the first five rows to sanity-check that columns were parsed as expected.
df_raw.head()
Out[35]:
TrialId TrialGroupId TrialTargetId TrialPhase TrialValue Trial TrialCreatedDate TrialDataDate TrialAuthorId GoalName ... GoalDomain GoalAssessment GoalCreatedDate GoalInitiatedDate GoalMetDate GoalInProgressDate GoalHoldDate GoalDiscontinuedDate GoalDataType GoalPercentCorrectTrend
0 633873157 152165867 14090357 Intervention 0.0 1 2018-06-27 22:08:41.463000000 2018-06-27 17:08:00 648462 When peer enters the room ... Social Verbal Behavior Milestone Assessment and Place... 2017-06-16 16:22:14.040000000 2018-06-27 2018-07-23 NaN NaN NaN datapercent 4.99875
1 633873158 152165867 14090357 Intervention 1.0 2 2018-06-27 22:08:41.467000000 2018-06-27 17:08:00 653043 When peer enters the room ... Social Verbal Behavior Milestone Assessment and Place... 2017-06-16 16:22:14.040000000 2018-06-27 2018-07-23 NaN NaN NaN datapercent 4.99875
2 633873159 152165867 14090357 Intervention 1.0 3 2018-06-27 22:08:41.467000000 2018-06-27 17:08:00 613587 When peer enters the room ... Social Verbal Behavior Milestone Assessment and Place... 2017-06-16 16:22:14.040000000 2018-06-27 2018-07-23 NaN NaN NaN datapercent 4.99875
3 633873160 152165867 14090357 Intervention 0.0 4 2018-06-27 22:08:41.470000000 2018-06-27 17:08:00 613587 When peer enters the room ... Social Verbal Behavior Milestone Assessment and Place... 2017-06-16 16:22:14.040000000 2018-06-27 2018-07-23 NaN NaN NaN datapercent 4.99875
4 633873161 152165867 14090357 Intervention 1.0 5 2018-06-27 22:08:41.470000000 2018-06-27 17:08:00 613587 When peer enters the room ... Social Verbal Behavior Milestone Assessment and Place... 2017-06-16 16:22:14.040000000 2018-06-27 2018-07-23 NaN NaN NaN datapercent 4.99875

5 rows × 23 columns

In [30]:
# Summary statistics for the numeric columns only (describe() default).
# NOTE(review): the *Id columns are summarised as numbers here; they are
# presumably identifiers, so their mean/std/quantiles carry no meaning -- confirm.
df_raw.describe()
Out[30]:
TrialId TrialGroupId TrialTargetId TrialValue Trial TrialAuthorId ClientId GoalPercentCorrectTrend
count 5.478540e+05 5.478540e+05 5.478540e+05 547854.000000 547854.000000 5.478540e+05 5.478540e+05 547854.000000
mean 1.653955e+09 3.818736e+08 4.959252e+07 0.687265 5.328792 6.812767e+05 6.378015e+05 2.255574
std 7.356845e+08 1.668407e+08 2.139539e+07 0.463608 6.215507 2.520986e+05 2.450082e+05 5.291559
min 3.127852e+08 7.951781e+07 1.061140e+07 0.000000 1.000000 8.100600e+04 8.855000e+04 -45.000000
25% 1.001750e+09 2.345342e+08 3.239924e+07 0.000000 2.000000 4.831380e+05 4.542440e+05 0.000000
50% 1.650829e+09 3.790394e+08 4.560698e+07 1.000000 4.000000 6.704560e+05 6.090000e+05 0.277777
75% 2.272206e+09 5.214655e+08 6.557220e+07 1.000000 6.000000 8.602480e+05 8.031480e+05 2.142857
max 2.986038e+09 6.851508e+08 1.020322e+08 1.000000 144.000000 1.295456e+06 1.282868e+06 50.000000
In [32]:
# Build a pandas-profiling report over the whole raw frame; as the last
# expression of the cell, the report object is what gets displayed.
# NOTE(review): this profiles every row and column of df_raw, which may be slow
# on a frame of this size -- consider profiling a sample if it becomes a bottleneck.
pandas_profiling.ProfileReport(df_raw)



Out[32]:

In [37]:
# Inspect the dtype pandas inferred for each column.
# The *Date columns come back as object rather than datetime64, i.e. they were
# not parsed as dates on load.
df_raw.dtypes
Out[37]:
TrialId                      int64
TrialGroupId                 int64
TrialTargetId                int64
TrialPhase                  object
TrialValue                 float64
Trial                        int64
TrialCreatedDate            object
TrialDataDate               object
TrialAuthorId                int64
GoalName                    object
ClientId                     int64
GoalType                    object
CurrentGoalStatus           object
GoalDomain                  object
GoalAssessment              object
GoalCreatedDate             object
GoalInitiatedDate           object
GoalMetDate                 object
GoalInProgressDate          object
GoalHoldDate                object
GoalDiscontinuedDate        object
GoalDataType                object
GoalPercentCorrectTrend    float64
dtype: object
In [ ]:
# Frequency table for each categorical column.
#
# DataFrame.value_counts() without `subset` counts distinct *whole rows* across
# every column and, by default, drops any row containing a NaN. On this frame
# (per-trial id columns, several mostly-empty Goal*Date columns) that does not
# give a usable frequency table, so count values column by column instead.
# NOTE(review): the column list assumes these are the low-cardinality
# categorical fields -- adjust if others are of interest.
categorical_cols = [
    "TrialPhase",
    "GoalType",
    "CurrentGoalStatus",
    "GoalDomain",
    "GoalDataType",
]
# One Series indexed by (column, value) -> count; NaN is kept as its own bucket
# so missing data stays visible.
pd.concat(
    {col: df_raw[col].value_counts(dropna=False) for col in categorical_cols},
    names=["column", "value"],
)